Analysis

Creating the Linear Model

library(tidyverse)
library(readr)

train <- read.csv("train.csv", stringsAsFactors = TRUE)

train <- train %>%
  mutate(Alley = as.character(Alley),
         Alley = replace_na(Alley, "None"),
         Alley = as.factor(Alley)) %>%
  mutate(TotalSF = X1stFlrSF + X2ndFlrSF + TotalBsmtSF,
                RichNbrhd = case_when(Neighborhood %in% c("StoneBr", "NridgHt", "NoRidge") ~ 1,
                             TRUE ~ 0)) %>% 
  mutate(TotalBaths = FullBath + HalfBath/2 + BsmtFullBath + BsmtHalfBath/2)

First take a look at the pairs plot. Pairs Plot

Looks like TotalSF, GrLiveArea, YearBuilt, YearRemodAdd, and GarageArea all look like they effect the SalesPrice. We can test the lm.

mylm = lm(SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea + 
            TotalSF:GrLivArea + TotalSF:YearBuilt + TotalSF:YearRemodAdd,data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd + 
##     GarageArea + TotalSF:GrLivArea + TotalSF:YearBuilt + TotalSF:YearRemodAdd, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -363670  -18979    -993   16389  365061 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.110e+06  3.805e+05   8.173 6.48e-16 ***
## TotalSF              -2.028e+03  1.513e+02 -13.402  < 2e-16 ***
## GrLivArea             7.466e+01  5.169e+00  14.443  < 2e-16 ***
## YearBuilt            -3.302e+02  1.441e+02  -2.292   0.0221 *  
## YearRemodAdd         -1.271e+03  2.092e+02  -6.076 1.57e-09 ***
## GarageArea            4.743e+01  6.190e+00   7.661 3.35e-14 ***
## TotalSF:GrLivArea    -1.409e-02  1.059e-03 -13.308  < 2e-16 ***
## TotalSF:YearBuilt     2.744e-01  5.535e-02   4.957 8.01e-07 ***
## TotalSF:YearRemodAdd  7.813e-01  8.754e-02   8.924  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38470 on 1451 degrees of freedom
## Multiple R-squared:  0.7667, Adjusted R-squared:  0.7654 
## F-statistic: 596.2 on 8 and 1451 DF,  p-value: < 2.2e-16

The lm shows promise so some interactions where tested.

mylm = lm(SalePrice ~ 
            
            TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea + 
            TotalSF:(GrLivArea + YearBuilt + YearRemodAdd) +
            GrLivArea:(YearBuilt + GarageArea) +
            YearBuilt:(GarageArea) +
            TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea
          
          ,data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd + 
##     GarageArea + TotalSF:(GrLivArea + YearBuilt + YearRemodAdd) + 
##     GrLivArea:(YearBuilt + GarageArea) + YearBuilt:(GarageArea) + 
##     TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -406737  -16695     440   14573  253504 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                          2.607e+06  3.589e+05
## TotalSF                                             -2.037e+03  2.076e+02
## GrLivArea                                            6.383e+02  2.539e+02
## YearBuilt                                           -4.719e+02  1.344e+02
## YearRemodAdd                                        -8.032e+02  1.975e+02
## GarageArea                                          -1.295e+03  3.317e+02
## TotalSF:GrLivArea                                    1.804e-02  2.516e-03
## TotalSF:YearBuilt                                    4.783e-01  1.101e-01
## TotalSF:YearRemodAdd                                 5.711e-01  8.452e-02
## GrLivArea:YearBuilt                                 -3.613e-01  1.294e-01
## GrLivArea:GarageArea                                 1.840e-01  1.245e-02
## YearBuilt:GarageArea                                 6.041e-01  1.690e-01
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -7.204e-12  4.334e-13
##                                                     t value Pr(>|t|)    
## (Intercept)                                           7.264 6.12e-13 ***
## TotalSF                                              -9.812  < 2e-16 ***
## GrLivArea                                             2.514 0.012038 *  
## YearBuilt                                            -3.512 0.000459 ***
## YearRemodAdd                                         -4.067 5.02e-05 ***
## GarageArea                                           -3.903 9.92e-05 ***
## TotalSF:GrLivArea                                     7.171 1.18e-12 ***
## TotalSF:YearBuilt                                     4.345 1.49e-05 ***
## TotalSF:YearRemodAdd                                  6.757 2.03e-11 ***
## GrLivArea:YearBuilt                                  -2.792 0.005304 ** 
## GrLivArea:GarageArea                                 14.774  < 2e-16 ***
## YearBuilt:GarageArea                                  3.575 0.000362 ***
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -16.621  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35030 on 1447 degrees of freedom
## Multiple R-squared:  0.8072, Adjusted R-squared:  0.8056 
## F-statistic: 504.9 on 12 and 1447 DF,  p-value: < 2.2e-16

This gives a better r squared, let start plotting the chart to find more variables.

ggplot(train, aes(y=SalePrice, x = TotalSF, color = RichNbrhd)) +
  geom_point()

Rich Neighborhood seems to be a good indicator of sales price, when including it in the model we get:

mylm = lm(SalePrice ~ 
            
            TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea + 
            TotalSF:(YearRemodAdd) +
            GrLivArea:(YearBuilt + GarageArea) +
            YearBuilt:(GarageArea) +
            TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea +
            
            RichNbrhd + RichNbrhd:TotalSF
          
          ,data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd + 
##     GarageArea + TotalSF:(YearRemodAdd) + GrLivArea:(YearBuilt + 
##     GarageArea) + YearBuilt:(GarageArea) + TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea + 
##     RichNbrhd + RichNbrhd:TotalSF, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -213945  -15062     -82   13542  171844 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                          7.905e+05  3.377e+05
## TotalSF                                             -1.055e+03  1.363e+02
## GrLivArea                                            2.467e+02  1.219e+02
## YearBuilt                                            3.675e+02  1.051e+02
## YearRemodAdd                                        -7.425e+02  1.648e+02
## GarageArea                                          -9.082e+02  2.798e+02
## RichNbrhd                                           -1.770e+05  1.319e+04
## TotalSF:YearRemodAdd                                 5.549e-01  6.883e-02
## GrLivArea:YearBuilt                                 -1.295e-01  6.227e-02
## GrLivArea:GarageArea                                 1.033e-01  1.080e-02
## YearBuilt:GarageArea                                 4.347e-01  1.422e-01
## TotalSF:RichNbrhd                                    6.312e+01  3.718e+00
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -3.348e-12  2.040e-13
##                                                     t value Pr(>|t|)    
## (Intercept)                                           2.341 0.019385 *  
## TotalSF                                              -7.736 1.90e-14 ***
## GrLivArea                                             2.024 0.043183 *  
## YearBuilt                                             3.497 0.000484 ***
## YearRemodAdd                                         -4.505 7.18e-06 ***
## GarageArea                                           -3.246 0.001198 ** 
## RichNbrhd                                           -13.413  < 2e-16 ***
## TotalSF:YearRemodAdd                                  8.062 1.56e-15 ***
## GrLivArea:YearBuilt                                  -2.079 0.037749 *  
## GrLivArea:GarageArea                                  9.559  < 2e-16 ***
## YearBuilt:GarageArea                                  3.057 0.002277 ** 
## TotalSF:RichNbrhd                                    16.976  < 2e-16 ***
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -16.416  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31520 on 1447 degrees of freedom
## Multiple R-squared:  0.8438, Adjusted R-squared:  0.8425 
## F-statistic: 651.5 on 12 and 1447 DF,  p-value: < 2.2e-16

This helped to increase the rsquared of the lm. Next I want to look at the baths in the house.

ggplot(train, aes(y=SalePrice, x = TotalSF, color = interaction(as.factor(TotalBaths)))) +
  geom_point() +
  facet_wrap(~interaction(as.factor(TotalBaths)))

Besides the 4.5 bath houses it seems that there is a relationship between the number of baths and the price of the house. Adding Total Baths to the lm we get:

mylm = lm(SalePrice ~ 
            
            TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea + 
            TotalSF:(YearRemodAdd) +
            GrLivArea:(YearBuilt + GarageArea) +
            YearBuilt:(GarageArea) +
            TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea +
            
            RichNbrhd + RichNbrhd:TotalSF +
            
            TotalBaths + RichNbrhd:TotalBaths
          
          ,data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd + 
##     GarageArea + TotalSF:(YearRemodAdd) + GrLivArea:(YearBuilt + 
##     GarageArea) + YearBuilt:(GarageArea) + TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea + 
##     RichNbrhd + RichNbrhd:TotalSF + TotalBaths + RichNbrhd:TotalBaths, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -210966  -14860    -234   12922  174373 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                          1.169e+06  3.383e+05
## TotalSF                                             -1.160e+03  1.346e+02
## GrLivArea                                            2.929e+02  1.201e+02
## YearBuilt                                            3.120e+02  1.047e+02
## YearRemodAdd                                        -8.829e+02  1.631e+02
## GarageArea                                          -9.708e+02  2.749e+02
## RichNbrhd                                           -2.147e+05  1.560e+04
## TotalBaths                                           7.763e+03  1.585e+03
## TotalSF:YearRemodAdd                                 6.076e-01  6.797e-02
## GrLivArea:YearBuilt                                 -1.553e-01  6.134e-02
## GrLivArea:GarageArea                                 9.746e-02  1.064e-02
## YearBuilt:GarageArea                                 4.701e-01  1.397e-01
## TotalSF:RichNbrhd                                    5.654e+01  4.076e+00
## RichNbrhd:TotalBaths                                 2.110e+04  5.185e+03
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -3.258e-12  2.007e-13
##                                                     t value Pr(>|t|)    
## (Intercept)                                           3.455 0.000565 ***
## TotalSF                                              -8.612  < 2e-16 ***
## GrLivArea                                             2.440 0.014819 *  
## YearBuilt                                             2.980 0.002930 ** 
## YearRemodAdd                                         -5.414 7.20e-08 ***
## GarageArea                                           -3.531 0.000426 ***
## RichNbrhd                                           -13.766  < 2e-16 ***
## TotalBaths                                            4.898 1.08e-06 ***
## TotalSF:YearRemodAdd                                  8.939  < 2e-16 ***
## GrLivArea:YearBuilt                                  -2.533 0.011424 *  
## GrLivArea:GarageArea                                  9.160  < 2e-16 ***
## YearBuilt:GarageArea                                  3.364 0.000788 ***
## TotalSF:RichNbrhd                                    13.874  < 2e-16 ***
## RichNbrhd:TotalBaths                                  4.068 4.99e-05 ***
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -16.233  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30960 on 1445 degrees of freedom
## Multiple R-squared:  0.8496, Adjusted R-squared:  0.8481 
## F-statistic: 583.1 on 14 and 1445 DF,  p-value: < 2.2e-16

After reviewing the lm it seems like GrLivArea and TotalSF are linked because the living area will scale with squarefootage, so GRLivArea will be removed. YearBuilt and GarageArea would have a interpritaiton of “the average price increases by $0.47 for every unit increase to either GarageArea or YearBuilt. Such a interpritation doesn’t make sense and removing it keeps the lm significant.

mylm = lm(SalePrice ~ 
            
            TotalSF + YearRemodAdd + GarageArea + 
            TotalSF:(YearBuilt) +
            TotalSF:YearBuilt:YearRemodAdd:GarageArea +
            
            RichNbrhd + RichNbrhd:TotalSF +
            
            TotalBaths + RichNbrhd:TotalBaths
          
          ,data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ TotalSF + YearRemodAdd + GarageArea + 
##     TotalSF:(YearBuilt) + TotalSF:YearBuilt:YearRemodAdd:GarageArea + 
##     RichNbrhd + RichNbrhd:TotalSF + TotalBaths + RichNbrhd:TotalBaths, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -248033  -16369   -1957   12971  253335 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                               -1.189e+06  1.077e+05 -11.034
## TotalSF                                   -1.363e+02  2.993e+01  -4.554
## YearRemodAdd                               5.937e+02  5.482e+01  10.829
## GarageArea                                 1.255e+02  9.644e+00  13.011
## RichNbrhd                                 -2.227e+05  1.665e+04 -13.381
## TotalBaths                                 1.044e+04  1.658e+03   6.296
## TotalSF:YearBuilt                          9.670e-02  1.549e-02   6.245
## TotalSF:RichNbrhd                          6.477e+01  4.400e+00  14.722
## RichNbrhd:TotalBaths                       1.993e+04  5.681e+03   3.509
## TotalSF:YearRemodAdd:GarageArea:YearBuilt -7.451e-09  8.413e-10  -8.857
##                                           Pr(>|t|)    
## (Intercept)                                < 2e-16 ***
## TotalSF                                   5.71e-06 ***
## YearRemodAdd                               < 2e-16 ***
## GarageArea                                 < 2e-16 ***
## RichNbrhd                                  < 2e-16 ***
## TotalBaths                                4.03e-10 ***
## TotalSF:YearBuilt                         5.57e-10 ***
## TotalSF:RichNbrhd                          < 2e-16 ***
## RichNbrhd:TotalBaths                      0.000463 ***
## TotalSF:YearRemodAdd:GarageArea:YearBuilt  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34230 on 1450 degrees of freedom
## Multiple R-squared:  0.8155, Adjusted R-squared:  0.8144 
## F-statistic: 712.3 on 9 and 1450 DF,  p-value: < 2.2e-16

Model Validation

set.seed(122)

num_rows <- 1000 #1460 total
keep <- sample(1:nrow(train), num_rows)

mytrain <- train[keep, ]

mytest <- train[-keep, ]

# Compute R-squared for each validation

  # Get y-hat for each model on new data.
  yht <- predict(mylm, newdata=mytest)
  
  # Compute y-bar
  ybar <- mean(mytest$SalePrice) #Yi is given by Ynew from the new sample of data
  
  # Compute SSTO
  SSTO <- sum( (mytest$SalePrice - ybar)^2 )
  
  # Compute SSE for each model using y - yhat
  SSEt <- sum( (mytest$SalePrice - yht)^2 )
  
  # Compute R-squared for each
  rst <- 1 - SSEt/SSTO
  
  # Compute adjusted R-squared for each
  n <- length(mytest$SalePrice) #sample size
  pt <- length(coef(mylm)) #num. parameters in model
  rsta <- 1 - (n-1)/(n-pt)*SSEt/SSTO
  

my_output_table2 <- data.frame(Model = c("My House Price Model"), `Original R2` = c(summary(mylm)$r.squared), `Orig. Adj. R-squared` = c(summary(mylm)$adj.r.squared), `Validation R-squared` = c(rst), `Validation Adj. R^2` = c(rsta))

colnames(my_output_table2) <- c("Model", "Original $R^2$", "Original Adj. $R^2$", "Validation $R^2$", "Validation Adj. $R^2$")

knitr::kable(my_output_table2, escape=TRUE, digits=4)
Model Original \(R^2\) Original Adj. \(R^2\) Validation \(R^2\) Validation Adj. \(R^2\)
My House Price Model 0.8155 0.8144 0.8591 0.8562

The validation shows that the model works well.

Final Model

My final model is: \[ \begin{aligned} \hat{Y} =\ & \beta_0 + \beta_1 X_{\text{TotalSF}} + \beta_2 X_{\text{YearRemodAdd}} + \beta_3 X_{\text{GarageArea}} \\ &+ \beta_4 (X_{\text{TotalSF}} \cdot X_{\text{YearBuilt}}) \\ &+ \beta_5 (X_{\text{TotalSF}} \cdot X_{\text{YearBuilt}} \cdot X_{\text{YearRemodAdd}} \cdot X_{\text{GarageArea}}) \\ & + \beta_6 X_{\text{RichNbrhd}} + \beta_7 (X_{\text{RichNbrhd}} \cdot X_{\text{TotalSF}}) \\ &+ \beta_8 X_{\text{TotalBaths}} + \beta_{9} (X_{\text{RichNbrhd}} \cdot X_{\text{TotalBaths}}) \end{aligned} \]

These can be interpreted in the following ways:

  • \(\beta_0\): Price when everything is zero.
  • \(\beta_1\): Change in Price per unit increase of TotalSF when everything else is zero.
  • \(\beta_2\): Change in Price per unit increase of YearRemodAdd when everything else is zero.
  • \(\beta_3\): Change in Price per unit increase of GarageArea when everything else is zero.
  • \(\beta_4\): Change in the TotalSF slope per unit increase of YearBuilt when everything else is zero.
  • \(\beta_5\): Change in the TotalSF slope per unit increase of YearBuilt × YearRemodAdd × GarageArea.
  • \(\beta_6\): Change in Price if the House is in a wealthy Neighborhood when everything else is zero
  • \(\beta_7\): Change in the TotalSF slope per unit increase of RichNbrhd when everything else is zero
  • \(\beta_8\): Change in Price per unit increase of TotalBaths when everything else is zero.
  • \(\beta_9\): Change in TotalBaths difference when the House is in a wealthy Neighborhood when everything else is zero

Model Plot

TotalSF v YearBuilt

The following plot uses the averages of YearRemodAdd, GarageArea, RichNbrhd, and TotalBaths, all rounded to the nearest whole number.

## Hint: library(car) has a scatterplot 3d function which is simple to use
#  but the code should only be run in your console, not knit.
library(plotly)
library(reshape2)
library(car)
## scatter3d(Y ~ X1 + X2, data=yourdata)



## To embed the 3d-scatterplot inside of your html document is harder.
#library(plotly)
#library(reshape2)

#Perform the multiple regression

#Graph Resolution (more important for more complex shapes)
graph_reso <- 100

#Setup Axis
axis_x <- seq(min(train$TotalSF), max(train$TotalSF), by = graph_reso)
axis_y <- seq(min(train$YearBuilt), max(train$YearBuilt), by = 1)

#Sample points
price_surface <- expand.grid(TotalSF = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F) %>% 
  mutate(YearRemodAdd = 1985,
         GarageArea = 473,
         RichNbrhd = 0,
         TotalBaths = 2)
price_surface$Z <- predict.lm(mylm, newdata = price_surface)
price_surface <- acast(price_surface, YearBuilt ~ TotalSF, value.var = "Z") #y ~ x

#Create scatterplot
plot_ly(train, 
        x = ~TotalSF, 
        y = ~YearBuilt, 
        z = ~SalePrice,
        type = "scatter3d", 
        mode = "markers") %>%
  add_trace(z = price_surface,
            x = axis_x,
            y = axis_y,
            type = "surface")

The following is a plot using the maximum of YearRemodAdd, GarageArea, RichNbrhd, and TotalBaths.

#Sample points
price_surface <- expand.grid(TotalSF = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F) %>% 
  mutate(YearRemodAdd = max(train$YearRemodAdd),
         GarageArea = max(train$GarageArea),
         RichNbrhd = max(train$RichNbrhd),
         TotalBaths = max(train$TotalBaths))
price_surface$Z <- predict.lm(mylm, newdata = price_surface)
price_surface <- acast(price_surface, YearBuilt ~ TotalSF, value.var = "Z") #y ~ x

#Create scatterplot
plot_ly(train, 
        x = ~TotalSF, 
        y = ~YearBuilt, 
        z = ~SalePrice,
        type = "scatter3d", 
        mode = "markers") %>%
  add_trace(z = price_surface,
            x = axis_x,
            y = axis_y,
            type = "surface")

The following is a plot using the minimum of YearRemodAdd, GarageArea, RichNbrhd, and TotalBaths.

#Sample points
price_surface <- expand.grid(TotalSF = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F) %>% 
  mutate(YearRemodAdd = min(train$YearRemodAdd),
         GarageArea = min(train$GarageArea),
         RichNbrhd = min(train$RichNbrhd),
         TotalBaths = min(train$TotalBaths))
price_surface$Z <- predict.lm(mylm, newdata = price_surface)
price_surface <- acast(price_surface, YearBuilt ~ TotalSF, value.var = "Z") #y ~ x

#Create scatterplot
plot_ly(train, 
        x = ~TotalSF, 
        y = ~YearBuilt, 
        z = ~SalePrice,
        type = "scatter3d", 
        mode = "markers") %>%
  add_trace(z = price_surface,
            x = axis_x,
            y = axis_y,
            type = "surface")